# =============================================================================================  
#=================================== Lib Functions =======================================
# =============================================================================================  


# ========================= Gen design mat ===========================
#return design matrix of size n x p, whose rows are independent Gaussian with cov decay r
genDesignMat <- function(n, p, r){
  V <- matrix(0, nrow=p, ncol=p)
  for (i in 1:p){
    for (j in 1:p){
      V[i,j] <-  r^(abs(i-j))
    }
  }
  mvrnorm(n, mu = rep(0,p), Sigma = V)
}
#genDesignMat(1,4,0.5)



# ========================= bivariate EF ===========================

#est. params 
#dat: nx2 data
est_EF_param <- function(dat){
  Y1 <- dat[,1,drop=FALSE]
  Y2 <- dat[,2,drop=FALSE]
  n <- nrow(Y1)
  a1 <- matrix(rep(0, 6*n), ncol = 6, nrow = n) 
  a2 <- matrix(rep(0, 6*n), ncol = 6, nrow = n) 
  a3 <- matrix(rep(0, 6*n), ncol = 6, nrow = n) 
  a4 <- matrix(rep(0, 6*n), ncol = 6, nrow = n) 
  a1[,1] <- 2 * Y1 * Y2^2 
  a1[,2] <- 2 * Y1
  a1[,4] <- Y2
  a1[,5] <- rep(1, n)
  a2[,1] <- 2 * Y1^2 * Y2
  a2[,3] <- 2 * Y2
  a2[,4] <- Y1
  a2[,6] <- rep(1, n)
  a3[,1] <- 2 * Y2^2 
  a3[,2] <- 2 * rep(1,n)
  a4[,1] <- 2 * Y1^2 
  a4[,3] <- 2 * rep(1,n) 
  A <- ( t(a1) %*% a1 + t(a2) %*% a2 )/n
  b <- ( matrix(colSums(a3),ncol=1)+matrix(colSums(a4),ncol=1) )/n
  theta <- - solve( A ) %*% b
  #make sure it is integrable ... 
  if (theta[1] >= 0){ 
    theta[1] <- 0
    if (theta[2] >= 0){
      theta[2] <- 0
      theta[4] <- 0
      theta[5] <- 0
    }
    if (theta[3] >= 0){
      theta[3] <- 0
      theta[4] <- 0
      theta[6] <- 0
    }
    if (theta[2] * theta[3] - theta[4]^2/4 <= 0){ #check positiveness of the cov matrix 
      theta[4] <- 0 
    }
  }
  theta
}

library(stats4)
est_MLE_EF_param <- function(dat){
  n <- nrow(dat)
  log_lik <- function(theta){ #(theta1,theta2,theta3,theta4,theta5,theta6) {
    densi_fun <- function(x){
      exp( theta[1] * x[1]^2 * x[2]^2 + theta[2] * x[1]^2 + theta[3] * x[2]^2+ theta[4] * x[1] * x[2] +
           theta[5] * x[1] + theta[6] * x[2])
    }
    samp <- MCMC(p.log_para, n=1000, init=c(0, 1), scale=c(1, 0.1), adapt=TRUE, acc.rate=0.3) #n=500 is the original version 
    const <- hcubature(densi_fun, rep(-10,2), rep(10,2), tol=1e-4)$integral
    
    #l <- mean(theta1 * dat[,1]^2 * dat[,2]^2 + theta2 * dat[,1]^2 + theta3 * dat[,2]^2+ theta4 * dat[,1] * dat[,2] +
    #  theta5 * dat[,1] + theta6 * dat[,2]) 
    l <- - mean(theta[1] * dat[,1]^2 * dat[,2]^2 + theta[2] * dat[,1]^2 + theta[3] * dat[,2]^2+ theta[4] * dat[,1] * dat[,2] +
                theta[5] * dat[,1] + theta[6] * dat[,2]) + log(const) 
    l
  }
  #fit_mle <- mle(log_lik, method = "BFGS", start = list(theta = c(theta1=0,theta2=-1,theta3=-1,theta4=0,theta5=0,theta6=0)))
  fit1 <- optim(c(0,-1,-1,0,0,0), log_lik, hessian = FALSE, method = "BFGS", control = list(maxit=10))
  
  theta <- fit1$par

  #make sure it is integrable ... 
  if (theta[1] >= 0){ 
    theta[1] <- 0
    if (theta[2] >= 0){
      theta[2] <- 0
      theta[4] <- 0
      theta[5] <- 0
    }
    if (theta[3] >= 0){
      theta[3] <- 0
      theta[4] <- 0
      theta[6] <- 0
    }
    if (theta[2] * theta[3] - theta[4]^2/4 <= 0){ #check positiveness of the cov matrix 
      theta[4] <- 0 
    }
  }
  theta
}


#calculate_entr:  calculate gradient entropy and ultimately return mutual info based on fitted model  
#note that it does not depend on the order of Y1 Y2
calculate_entr <- function(theta){
  #generate simulated data 
  p.log_para <- function(x) {
    theta[1] * x[1]^2 * x[2]^2 + theta[2] * x[1]^2 + theta[3] * x[2]^2+ theta[4] * x[1] * x[2] +
      theta[5] * x[1] + theta[6] * x[2] 
  }
  samp <- MCMC(p.log_para, n=1000, init=c(0, 1), scale=c(1, 0.1), adapt=TRUE, acc.rate=0.3) #n=500 is the original version 
  dat <- samp$samples
  dat <- dat[seq(1,nrow(dat),1),]
  Y1 <- dat[,1,drop=FALSE]
  Y2 <- dat[,2,drop=FALSE]
  n <- nrow(Y1)
  a1 <- matrix(rep(0, 6*n), ncol = 6, nrow = n) 
  a2 <- matrix(rep(0, 6*n), ncol = 6, nrow = n) 
  a1[,1] <- 2 * Y1 * Y2^2 
  a1[,2] <- 2 * Y1
  a1[,4] <- Y2
  a1[,5] <- rep(1, n)
  a2[,1] <- 2 * Y1^2 * Y2
  a2[,3] <- 2 * Y2
  a2[,4] <- Y1
  a2[,6] <- rep(1, n)
  
  #joint entropy 
  entr_Y12 <- -0.5 * ( sum((a1 %*% theta)^2) + sum((a2 %*% theta)^2) )/n
  #Y1 entropy 
  temp1 <- -theta[1] * Y2 / (theta[1] * Y2^2 + theta[3]) + 
    0.5 * theta[1] * Y2 * (theta[4] * Y2 + theta[6])^2 / (theta[1] * Y2^2 + theta[3])^2 -
    0.5 * theta[4] * (theta[4] * Y2 + theta[6]) / (theta[1] * Y2^2 + theta[3]) + 
    2 * theta[2] * Y2 + theta[6]
  entr_Y1 <- -0.5 * sum(temp1^2) /n
  #Y2 entropy 
  temp2 <- -theta[1] * Y2 / (theta[1] * Y2^2 + theta[2]) + 
    0.5 * theta[1] * Y2 * (theta[4] * Y2 + theta[5])^2 / (theta[1] * Y2^2 + theta[2])^2 -
    0.5 * theta[4] * (theta[4] * Y2 + theta[5]) / (theta[1] * Y2^2 + theta[2]) + 
    2 * theta[3] * Y2 + theta[5]
  entr_Y2 <- -0.5 * sum(temp2^2) /n
  muInfo <- max(entr_Y1 + entr_Y2 - entr_Y12, 0)
  muInfo
}

#install.packages('cubature')
library('cubature')
#testFn0 <- function(x){(2*pi)^{-1}*exp(-sum(x^2)/2)}
#hcubature(testFn0, rep(-5,2), rep(5,2), tol=1e-4)
#for computational comparison only
calculate_Shannon_entr <- function(theta){
  #generate simulated data 
  p.log_para <- function(x) {
    theta[1] * x[1]^2 * x[2]^2 + theta[2] * x[1]^2 + theta[3] * x[2]^2+ theta[4] * x[1] * x[2] +
      theta[5] * x[1] + theta[6] * x[2]  
  }
  fun <- function(x){
    exp(p.log_para(x))
  }
  samp <- MCMC(p.log_para, n=1000, init=c(0, 1), scale=c(1, 0.1), adapt=TRUE, acc.rate=0.3) #n=500 is the original version 
  const <- hcubature(fun, rep(-10,2), rep(10,2), tol=1e-4)$integral
  dat <- samp$samples
  dat <- dat[seq(1,nrow(dat),1),]
  Y1 <- dat[,1,drop=FALSE]
  Y2 <- dat[,2,drop=FALSE]
  n <- nrow(Y1)
  
  #joint entropy 
  temp12 <- 0
  temp1 <- 0
  temp2 <- 0
  for (k in 1:n){
    temp12 <- temp12 - p.log_para(dat[k,])/n
    temp1 <- temp1 - p.log_para(dat[k,])/n  
    temp2 <- temp2 - p.log_para(dat[k,])/n 
  }
  entr_Y12 <- temp12 + log(const)
  entr_Y1 <- temp1 + log(const)
  entr_Y2 <- temp2 + log(const)
  muInfo <- max(entr_Y1 + entr_Y2 - entr_Y12, 0)
  muInfo
}



#log conditional density of Y1 on Y2, given the parameter 
#index of Y1 is smaller than index of Y2
cond_log_density <- function(Ys, ind1, ind2, theta){
  Y1 <- Ys[,ind1,drop = FALSE]
  Y2 <- Ys[,ind2,drop = FALSE]
  #if switch is true, then shuffle theta to accomodate the order of Y1 and Y2 (in representation using theta) 
  if (ind1 > ind2){ #switch is true
    theta_s <- c(theta[1], theta[3], theta[2], theta[4], theta[6], theta[5])
  }else{
    theta_s <- theta
  }
  log_den <- 0.5 * log(-theta_s[1] * Y2^2 - theta_s[2]) + theta_s[1] * Y1^2 * Y2^2 + 
    theta_s[2] * Y1^2 +  theta_s[4] * Y1 * Y2 + theta_s[5] * Y1 + (theta_s[4] * Y2 + theta_s[5])^2 / 4 / (theta_s[1] * Y2^2 + theta_s[2])
}

#marginal density of Y1, calculated from the joint distribution of Y1 Y2 
#index of Y1 is smaller than index of Y2
#log_density <- function(Y1, theta, switch){
#   if (switch){ 
#     theta_s <- c(theta[1], theta[3], theta[2], theta[4], theta[6], theta[5])
#   }else{
#     theta_s <- theta
#   }
#   log_den <- -0.5 * log(-theta_s[1] * Y1^2 - theta_s[3]) - 
#     (theta_s[4] * Y1 + theta_s[6])^2 / 4 / (theta_s[1] * Y1^2 + theta_s[3]) +
#     theta_s[2] * Y1^2 + theta_s[5] * Y1
# }
#Newer version: directly calculate from empirical density 
log_density <- function(Y){
  #s2 <- var(as.numeric(Y))
  s2 <- 1
  #mu <- mean(as.numeric(Y))
  mu <- 0
  #n <- length(Y)
  -0.5 * log(s2) * n - 0.5 * (Y-mu)^2 / s2
}


#find C-L tree based on gradient information
#dat: n x p matrix of p variables 
CL_tree <- function(dat){
  p <- ncol(dat)
  theta_fac <- array(0, dim=c(p,p,6))
  #pb <- txtProgressBar(...)
  W <- matrix(rep(0, p^2), nrow = p, ncol = p)
  for (j in 1:(p-1)){
    for (k in (j+1):p){
      theta <- est_EF_param(dat[,c(j,k)])
      invisible(capture.output( W[j,k] <- calculate_entr(theta) + 10^(-6) )) #the perturbation is for graph processing convenience 
      theta_fac[j,k,] <- theta
      theta_fac[k,j,] <- theta
    }
    print('processed: ') 
    print((2*p-j-1)*j/(p*(p-1)))
    #setTxtProgressBar(pb, j)
  }
  W <- W + t(W)  
  ## create a weighted undirectional graph from the weight matrix
  gr <- graph_from_adjacency_matrix(-W, weighted = TRUE, mode = "undirected")
  ## find the minimum spanning tree
  mst <- mst(gr)
  ## create pairs of adjacent nodes 
  edges <- matrix( as.numeric(get.edgelist(mst)), ncol=2 )
  
  #par(mfrow=c(1,2), mar=c(0,1,0.75,0)) # sub-plots and margins
  #plot(gr , main="Graph")
  plot(mst, main = "MST")
  
  #calculate the root and the center
  ecc <- eccentricity(mst,mode="all")
  root <- which.max(ecc)
  cen <- which.min(ecc)
  
  #Numeric vector. The vertex ids, in the order in which they were visited by the search
  #from http://igraph.org/r/doc/dfs.html
  ord <- dfs(mst, root=root, order=TRUE)$order 
  rank <- rep(NA, p) #higher ranked Y and lower Y': cal the conditional Y'|Y  
  for (k in 1:p){
    rank[ord[k]] <- k
  }
  
  list(mst = mst, W = W, edges = edges, root = root, cen = cen, rank = rank, theta_fac = theta_fac)
  
}



CL_Shannon_tree <- function(dat){
  p <- ncol(dat)
  theta_fac <- array(0, dim=c(p,p,6))
  #pb <- txtProgressBar(...)
  W <- matrix(rep(0, p^2), nrow = p, ncol = p)
  for (j in 1:(p-1)){
    for (k in (j+1):p){
      theta <- est_MLE_EF_param(dat[,c(j,k)])
      invisible(capture.output( W[j,k] <- calculate_Shannon_entr(theta) + 10^(-6) )) #the perturbation is for graph processing convenience 
      theta_fac[j,k,] <- theta
      theta_fac[k,j,] <- theta
    }
    print('processed: ') 
    print((2*p-j-1)*j/(p*(p-1)))
    #setTxtProgressBar(pb, j)
  }
  W <- W + t(W)  
  ## create a weighted undirectional graph from the weight matrix
  gr <- graph_from_adjacency_matrix(-W, weighted = TRUE, mode = "undirected")
  ## find the minimum spanning tree
  mst <- mst(gr)
  ## create pairs of adjacent nodes 
  edges <- matrix( as.numeric(get.edgelist(mst)), ncol=2 )
  
  #par(mfrow=c(1,2), mar=c(0,1,0.75,0)) # sub-plots and margins
  #plot(gr , main="Graph")
  plot(mst, main = "MST")
  
  #calculate the root and the center
  ecc <- eccentricity(mst,mode="all")
  root <- which.max(ecc)
  cen <- which.min(ecc)
  
  #Numeric vector. The vertex ids, in the order in which they were visited by the search
  #from http://igraph.org/r/doc/dfs.html
  ord <- dfs(mst, root=root, order=TRUE)$order 
  rank <- rep(NA, p) #higher ranked Y and lower Y': cal the conditional Y'|Y  
  for (k in 1:p){
    rank[ord[k]] <- k
  }
  
  list(mst = mst, W = W, edges = edges, root = root, cen = cen, rank = rank, theta_fac = theta_fac)
  
}


predict_density <- function(cl_tree, X_test){
  edges <- cl_tree$edges
  root <- cl_tree$root
  rank <- cl_tree$rank
  theta_fac <- cl_tree$theta_fac
  log_den <- log_density(X_test[,root])
  p <- ncol(X_test)
  for (k in 1:(p-1)){
    ind1 <- edges[k,1]
    ind2 <- edges[k,2]
    theta <- theta_fac[ind1, ind2, ] 
    if (rank[ind1] < rank[ind2]){#if use ind1 conditional on ind2 
      log_den <- log_den + cond_log_density(X_test, ind1, ind2, theta)
    }else{
      log_den <- log_den + cond_log_density(X_test, ind2, ind1, theta)
    }
  }
  log_den
}


